{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "pd.set_option('display.max_columns', None)\n",
    "import numpy as np\n",
    "import ast \n",
    "import matplotlib.pyplot as plt \n",
    "import seaborn as sns \n",
    "import yaml\n",
    "from tqdm import tqdm\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "# Load the task specifications. safe_load is used because calling yaml.load without\n",
    "# a Loader warns on PyYAML 5.1+ and raises TypeError on PyYAML 6+.\n",
    "# NOTE(review): assumes ./task_specs is plain YAML without Python-specific tags - confirm.\n",
    "with open(\"./task_specs\", \"r\") as file:\n",
    "    task_specs = yaml.safe_load(file)\n",
    "import csop_helper as csop\n",
    "import importlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "phase = 1\n",
    "\n",
    "# Read the raw phase-1 rounds export.\n",
    "df = pd.read_csv(\"./rounds_data_phase1_raw.csv\")\n",
    "\n",
    "# Ordered categoricals; category lists are given from lowest to highest level.\n",
    "df['complexity_cat'] = pd.Categorical(\n",
    "    df['complexity'],\n",
    "    categories=['Very low', 'Low', 'Moderate', 'High', 'Very high'],\n",
    "    ordered=True,\n",
    ")\n",
    "df['block_cat'] = pd.Categorical(\n",
    "    df['block'],\n",
    "    categories=['ll', 'lh', 'ml', 'mh', 'hl', 'hh'],\n",
    "    ordered=True,\n",
    ")\n",
    "# The first character of each block code feeds skill_cat, the second feeds social_cat.\n",
    "df['skill_cat'] = pd.Categorical(\n",
    "    [code[0] for code in df['block']],\n",
    "    categories=['l', 'm', 'h'],\n",
    "    ordered=True,\n",
    ")\n",
    "df['social_cat'] = pd.Categorical(\n",
    "    [code[1] for code in df['block']],\n",
    "    categories=['l', 'h'],\n",
    "    ordered=True,\n",
    ")\n",
    "\n",
    "# The log and intermediateSolutions columns hold Python literals stored as text;\n",
    "# parse them back into Python objects.\n",
    "df['parsed_log'] = df['log'].apply(ast.literal_eval)\n",
    "df['parsed_intermediateSolutions'] = df['intermediateSolutions'].apply(ast.literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the project helper row by row to build each round's data.\n",
    "df['round_data'] = df.apply(lambda row: csop.generate_round_data(row, phase=phase), axis=1)\n",
    "\n",
    "# Flag a round as corrupted when the first subject_id in its round_data is None.\n",
    "# Assigning the whole column in one step keeps it a plain bool column with no\n",
    "# transient NaN values, and walks round_data only once.\n",
    "df['corrupted_data'] = [rd.subject_id.values[0] is None for rd in df['round_data']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "6116it [00:27, 222.79it/s]\n"
     ]
    }
   ],
   "source": [
    "# Compute one feature summary per round, keyed by round_id.\n",
    "round_summaries = {}\n",
    "# itertuples() has no length, so pass total explicitly to get a real progress bar / ETA.\n",
    "for row in tqdm(df.itertuples(), total=len(df)):\n",
    "    try:\n",
    "        round_summaries[row.round_id] = csop.phase1_round_features(\n",
    "            row.round_data, row.parsed_log, row.task_index, task_specs, phase=phase\n",
    "        )\n",
    "    except Exception:\n",
    "        # Show which round failed, then propagate the original error unchanged.\n",
    "        print(row.round_id)\n",
    "        raise\n",
    "\n",
    "# One row per summarised round, with round_id attached as the merge key.\n",
    "df_round_summaries = pd.DataFrame(list(round_summaries.values()))\n",
    "df_round_summaries['round_id'] = list(round_summaries.keys())\n",
    "\n",
    "# Attach the summaries to the round-level frame; the merge must not add or drop rows.\n",
    "df_export = df.merge(df_round_summaries, on=\"round_id\", how=\"inner\")\n",
    "assert len(df_export) == len(df), (\n",
    "    f\"merge changed the row count: {len(df)} -> {len(df_export)}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the merged frame (rounds plus per-round summaries) as a pickle file.\n",
    "df_export.to_pickle(path=\"./phase_1_processed.pkl\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py38",
   "language": "python",
   "name": "py38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
